import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
# List the files available in the local datasets directory.
os.listdir(r"C:\Users\kisha\OneDrive\Desktop\DOCUMENTS\data analyst\projects\Datasets")
['other-American_B01362.csv', 'other-Carmel_B00256.csv', 'other-Dial7_B00887.csv', 'other-Diplo_B01196.csv', 'other-Federal_02216.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Highclass_B01717.csv', 'other-Lyft_B02510.csv', 'other-Prestige_B01338.csv', 'other-Skyline_B00111.csv', 'Uber-Jan-Feb-FOIL.csv', 'uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Load the sampled Jan-June 2015 pickups file into a DataFrame and display it.
uber = pd.read_csv(r"C:\Users\kisha\OneDrive\Desktop\DOCUMENTS\data analyst\projects\Datasets\uber-raw-data-janjune-15_sample.csv")
uber
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | |
|---|---|---|---|---|
| 0 | B02617 | 2015-05-02 21:43:00 | B02764 | 237 |
| 1 | B02682 | 2015-01-20 19:52:59 | B02682 | 231 |
| 2 | B02617 | 2015-03-19 20:26:00 | B02617 | 161 |
| 3 | B02764 | 2015-04-10 17:38:00 | B02764 | 107 |
| 4 | B02764 | 2015-03-23 07:03:00 | B00111 | 140 |
| ... | ... | ... | ... | ... |
| 99995 | B02764 | 2015-04-13 16:12:00 | B02764 | 234 |
| 99996 | B02764 | 2015-03-06 21:32:00 | B02764 | 24 |
| 99997 | B02598 | 2015-03-19 19:56:00 | B02598 | 17 |
| 99998 | B02682 | 2015-05-02 16:02:00 | B02682 | 68 |
| 99999 | B02764 | 2015-06-24 16:04:00 | B02764 | 125 |
100000 rows × 4 columns
# (rows, columns) of the freshly loaded frame.
uber.shape
(100000, 4)
# Confirm that read_csv returned a DataFrame.
type(uber)
pandas.core.frame.DataFrame
# Count rows that are exact duplicates of an earlier row (all columns equal).
uber.duplicated().sum()
54
# Drop the duplicated rows in place; the surviving rows keep their original
# index labels (the index is not reset).
uber.drop_duplicates(inplace=True)
# Re-count duplicates after the drop, then show the new shape.
uber.duplicated().sum()
uber.shape
(99946, 4)
# Inspect column dtypes; Pickup_date is still a plain object (string) column here.
uber.dtypes
Dispatching_base_num object Pickup_date object Affiliated_base_num object locationID int64 dtype: object
# Number of missing values in each column.
uber.isnull().sum()
Dispatching_base_num 0 Pickup_date 0 Affiliated_base_num 1116 locationID 0 dtype: int64
# Peek at the Pickup_date value stored under index label 0.
uber['Pickup_date'][0]
'2015-05-02 21:43:00'
# Check the Python type of a single Pickup_date value before conversion.
type(uber['Pickup_date'][0])
str
# Convert the Pickup_date strings to pandas timestamps, then verify the type
# of the value stored under index label 0.
uber['Pickup_date'] = uber['Pickup_date'].pipe(pd.to_datetime)
type(uber['Pickup_date'].loc[0])
pandas._libs.tslibs.timestamps.Timestamp
# Re-check dtypes after the datetime conversion.
uber.dtypes
Dispatching_base_num object Pickup_date datetime64[ns] Affiliated_base_num object locationID int64 dtype: object
# Display the de-duplicated frame with the converted Pickup_date column.
uber
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | |
|---|---|---|---|---|
| 0 | B02617 | 2015-05-02 21:43:00 | B02764 | 237 |
| 1 | B02682 | 2015-01-20 19:52:59 | B02682 | 231 |
| 2 | B02617 | 2015-03-19 20:26:00 | B02617 | 161 |
| 3 | B02764 | 2015-04-10 17:38:00 | B02764 | 107 |
| 4 | B02764 | 2015-03-23 07:03:00 | B00111 | 140 |
| ... | ... | ... | ... | ... |
| 99995 | B02764 | 2015-04-13 16:12:00 | B02764 | 234 |
| 99996 | B02764 | 2015-03-06 21:32:00 | B02764 | 24 |
| 99997 | B02598 | 2015-03-19 19:56:00 | B02598 | 17 |
| 99998 | B02682 | 2015-05-02 16:02:00 | B02682 | 68 |
| 99999 | B02764 | 2015-06-24 16:04:00 | B02764 | 125 |
99946 rows × 4 columns
# Derive the month name of each pickup from its timestamp and display the new column.
uber['Month'] = uber['Pickup_date'].dt.month_name()
uber['Month']
0 May
1 January
2 March
3 April
4 March
...
99995 April
99996 March
99997 March
99998 May
99999 June
Name: Month, Length: 99946, dtype: object
# Bar chart of the number of pickups per month (bars ordered by count).
uber['Month'].value_counts().plot.bar()
<Axes: >
# Break the pickup timestamp into weekday name, day of month, hour and minute
# columns, reusing a single datetime accessor for all four.
pickup_dt = uber['Pickup_date'].dt
uber['WeekDay'] = pickup_dt.day_name()
uber['Day'] = pickup_dt.day
uber['Hour'] = pickup_dt.hour
uber['Minute'] = pickup_dt.minute
uber.head()
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | Month | WeekDay | Day | Hour | Minute | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | B02617 | 2015-05-02 21:43:00 | B02764 | 237 | May | Saturday | 2 | 21 | 43 |
| 1 | B02682 | 2015-01-20 19:52:59 | B02682 | 231 | January | Tuesday | 20 | 19 | 52 |
| 2 | B02617 | 2015-03-19 20:26:00 | B02617 | 161 | March | Thursday | 19 | 20 | 26 |
| 3 | B02764 | 2015-04-10 17:38:00 | B02764 | 107 | April | Friday | 10 | 17 | 38 |
| 4 | B02764 | 2015-03-23 07:03:00 | B00111 | 140 | March | Monday | 23 | 7 | 3 |
# Cross-tabulate pickup counts: one row per month, one column per weekday.
pivot_table = pd.crosstab(uber['Month'], uber['WeekDay'])
pivot_table
| WeekDay | Friday | Monday | Saturday | Sunday | Thursday | Tuesday | Wednesday |
|---|---|---|---|---|---|---|---|
| Month | |||||||
| April | 2365 | 1833 | 2508 | 2052 | 2823 | 1880 | 2521 |
| February | 2655 | 1970 | 2550 | 2183 | 2396 | 2129 | 2013 |
| January | 2508 | 1353 | 2745 | 1651 | 2378 | 1444 | 1740 |
| June | 2793 | 2848 | 3037 | 2485 | 2767 | 3187 | 2503 |
| March | 2465 | 2115 | 2522 | 2379 | 2093 | 2388 | 2007 |
| May | 3262 | 1865 | 3519 | 2944 | 2627 | 2115 | 2328 |
# Grouped bar chart of the month x weekday pickup counts.
pivot_table.plot.bar(
    figsize=(8, 6),
    title="Max Uber Pickups in New York City by month and Days",
)
<Axes: title={'center': 'Max Uber Pickups in New York City by month and Days'}, xlabel='Month'>
# Count pickups for every (weekday, hour) pair; the count lands in a "size" column.
summary = (
    uber.groupby(['WeekDay', 'Hour'])
    .size()
    .reset_index(name='size')
)
summary
| WeekDay | Hour | size | |
|---|---|---|---|
| 0 | Friday | 0 | 581 |
| 1 | Friday | 1 | 333 |
| 2 | Friday | 2 | 197 |
| 3 | Friday | 3 | 138 |
| 4 | Friday | 4 | 161 |
| ... | ... | ... | ... |
| 163 | Wednesday | 19 | 1044 |
| 164 | Wednesday | 20 | 897 |
| 165 | Wednesday | 21 | 949 |
| 166 | Wednesday | 22 | 900 |
| 167 | Wednesday | 23 | 669 |
168 rows × 3 columns
# Hourly pickup counts, one line per weekday.
# figsize must be passed as a keyword argument; calling `figsize(8,6)` as if it
# were a function raises NameError because no such name exists.
plt.figure(figsize=(8, 6))
sns.pointplot(x="Hour", y="size", hue="WeekDay", data=summary)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[151], line 1 ----> 1 plt.figure(figsize(8,6)) 2 sns.pointplot(x="Hour",y="size",hue="WeekDay",data=summary) NameError: name 'figsize' is not defined
# List the columns now present, including the derived date-part columns.
uber.columns
Index(['Dispatching_base_num', 'Pickup_date', 'Affiliated_base_num',
'locationID', 'Month', 'WeekDay', 'Day', 'Hour', 'Minute'],
dtype='object')
# List the datasets directory again to pick the next file to analyse.
os.listdir(r"C:\Users\kisha\OneDrive\Desktop\DOCUMENTS\data analyst\projects\Datasets")
['other-American_B01362.csv', 'other-Carmel_B00256.csv', 'other-Dial7_B00887.csv', 'other-Diplo_B01196.csv', 'other-Federal_02216.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Highclass_B01717.csv', 'other-Lyft_B02510.csv', 'other-Prestige_B01338.csv', 'other-Skyline_B00111.csv', 'Uber-Jan-Feb-FOIL.csv', 'uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Load the Jan-Feb FOIL file (per-base daily active vehicles and trips) and display it.
uber_foil = pd.read_csv(r"C:\Users\kisha\OneDrive\Desktop\DOCUMENTS\data analyst\projects\Datasets\Uber-Jan-Feb-FOIL.csv")
uber_foil
| dispatching_base_number | date | active_vehicles | trips | |
|---|---|---|---|---|
| 0 | B02512 | 1/1/2015 | 190 | 1132 |
| 1 | B02765 | 1/1/2015 | 225 | 1765 |
| 2 | B02764 | 1/1/2015 | 3427 | 29421 |
| 3 | B02682 | 1/1/2015 | 945 | 7679 |
| 4 | B02617 | 1/1/2015 | 1228 | 9537 |
| ... | ... | ... | ... | ... |
| 349 | B02764 | 2/28/2015 | 3952 | 39812 |
| 350 | B02617 | 2/28/2015 | 1372 | 14022 |
| 351 | B02682 | 2/28/2015 | 1386 | 14472 |
| 352 | B02512 | 2/28/2015 | 230 | 1803 |
| 353 | B02765 | 2/28/2015 | 747 | 7753 |
354 rows × 4 columns
# (rows, columns) of the FOIL frame.
uber_foil.shape
(354, 4)
!pip install chart_studio
!pip install plotly
Requirement already satisfied: chart_studio in c:\users\kisha\anaconda3\lib\site-packages (1.1.0) Requirement already satisfied: retrying>=1.3.3 in c:\users\kisha\anaconda3\lib\site-packages (from chart_studio) (1.3.4) Requirement already satisfied: plotly in c:\users\kisha\anaconda3\lib\site-packages (from chart_studio) (5.9.0) Requirement already satisfied: requests in c:\users\kisha\anaconda3\lib\site-packages (from chart_studio) (2.28.1) Requirement already satisfied: six in c:\users\kisha\anaconda3\lib\site-packages (from chart_studio) (1.16.0) Requirement already satisfied: tenacity>=6.2.0 in c:\users\kisha\anaconda3\lib\site-packages (from plotly->chart_studio) (8.0.1) Requirement already satisfied: idna<4,>=2.5 in c:\users\kisha\anaconda3\lib\site-packages (from requests->chart_studio) (3.4) Requirement already satisfied: certifi>=2017.4.17 in c:\users\kisha\anaconda3\lib\site-packages (from requests->chart_studio) (2023.5.7) Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\kisha\anaconda3\lib\site-packages (from requests->chart_studio) (1.26.14) Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\kisha\anaconda3\lib\site-packages (from requests->chart_studio) (2.0.4) Requirement already satisfied: plotly in c:\users\kisha\anaconda3\lib\site-packages (5.9.0) Requirement already satisfied: tenacity>=6.2.0 in c:\users\kisha\anaconda3\lib\site-packages (from plotly) (8.0.1)
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import download_plotlyjs , init_notebook_mode , plot , iplot
# Column names available for plotting.
uber_foil.columns
Index(['dispatching_base_number', 'date', 'active_vehicles', 'trips'], dtype='object')
# Distribution of active vehicles per dispatching base, as a box plot and as a violin plot.
px.box(uber_foil, x='dispatching_base_number', y='active_vehicles')
px.violin(uber_foil, x='dispatching_base_number', y='active_vehicles')
# Select the raw Uber trip files by name prefix instead of slicing the last
# eight directory entries: os.listdir() returns names in arbitrary order, so a
# positional slice only picks the right files by accident. Sorting makes the
# resulting list deterministic.
files = sorted(
    name
    for name in os.listdir(r"C:\Users\kisha\OneDrive\Desktop\DOCUMENTS\data analyst\projects\Datasets")
    if name.startswith('uber-raw-data')
)
files
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Exclude the full Jan-June 2015 file from the list of files to combine.
# list.remove raises ValueError if the name is not present.
files.remove('uber-raw-data-janjune-15.csv')
files
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Exclude the 2015 sample file as well, leaving only the 2014 monthly files.
files.remove('uber-raw-data-janjune-15_sample.csv')
files
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Combine the selected monthly trip files into a single DataFrame named `final`.
path = r"C:\Users\kisha\OneDrive\Desktop\DOCUMENTS\data analyst\projects\Datasets"
# Read every file first and concatenate once: concatenating inside the loop
# re-copies the accumulated frame on every iteration and starts from an empty
# DataFrame, which pd.concat has to special-case.
frames = [pd.read_csv(os.path.join(path, file_name)) for file_name in files]
# Reverse so the row order matches the earlier prepend-per-file behaviour
# (last file in `files` comes first); original per-file index labels are kept.
final = pd.concat(frames[::-1]) if frames else pd.DataFrame()
# (rows, columns) of the combined frame.
final.shape
(4534327, 4)
# Count rows that exactly duplicate an earlier row across all columns.
final.duplicated().sum()
82581
# Remove the duplicated rows in place and show the resulting shape.
final.drop_duplicates(inplace=True)
final.shape
(4451746, 4)
# Preview the first three rows of the combined frame.
final.head(3)
| Date/Time | Lat | Lon | Base | |
|---|---|---|---|---|
| 0 | 9/1/2014 0:01:00 | 40.2201 | -74.0021 | B02512 |
| 1 | 9/1/2014 0:01:00 | 40.7500 | -74.0027 | B02512 |
| 2 | 9/1/2014 0:03:00 | 40.7559 | -73.9864 | B02512 |
# Count pickups at each distinct (Lat, Lon) coordinate; the count lands in a "size" column.
rush_uber = (
    final.groupby(['Lat', 'Lon'])
    .size()
    .reset_index(name='size')
)
rush_uber
| Lat | Lon | size | |
|---|---|---|---|
| 0 | 39.6569 | -74.2258 | 1 |
| 1 | 39.6686 | -74.1607 | 1 |
| 2 | 39.7214 | -74.2446 | 1 |
| 3 | 39.8416 | -74.1512 | 1 |
| 4 | 39.9055 | -74.0791 | 1 |
| ... | ... | ... | ... |
| 574553 | 41.3730 | -72.9237 | 1 |
| 574554 | 41.3737 | -73.7988 | 1 |
| 574555 | 41.5016 | -72.8987 | 1 |
| 574556 | 41.5276 | -72.7734 | 1 |
| 574557 | 42.1166 | -72.0666 | 1 |
574558 rows × 3 columns
!pip install folium
Requirement already satisfied: folium in c:\users\kisha\anaconda3\lib\site-packages (0.14.0) Requirement already satisfied: jinja2>=2.9 in c:\users\kisha\anaconda3\lib\site-packages (from folium) (3.1.2) Requirement already satisfied: numpy in c:\users\kisha\anaconda3\lib\site-packages (from folium) (1.23.5) Requirement already satisfied: branca>=0.6.0 in c:\users\kisha\anaconda3\lib\site-packages (from folium) (0.6.0) Requirement already satisfied: requests in c:\users\kisha\anaconda3\lib\site-packages (from folium) (2.28.1) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\kisha\anaconda3\lib\site-packages (from jinja2>=2.9->folium) (2.1.1) Requirement already satisfied: certifi>=2017.4.17 in c:\users\kisha\anaconda3\lib\site-packages (from requests->folium) (2023.5.7) Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\kisha\anaconda3\lib\site-packages (from requests->folium) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in c:\users\kisha\anaconda3\lib\site-packages (from requests->folium) (3.4) Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\kisha\anaconda3\lib\site-packages (from requests->folium) (1.26.14)
import folium
# Create a base map with folium's default settings to draw the heat layer on.
basemap = folium.Map()
from folium.plugins import HeatMap
# Add a heat layer built from rush_uber (columns Lat, Lon, size) to the base map.
# NOTE(review): presumably each row is read as (lat, lon, weight) — confirm against the folium HeatMap docs.
HeatMap(rush_uber).add_to(basemap)
<folium.plugins.heat_map.HeatMap at 0x2bb194b9060>
# Render the map with the heat layer in the notebook output.
basemap